Write a short description about the course and add a link to your github repository here. This is an R markdown (.Rmd) file so you can use R markdown syntax. See the ‘Useful links’ page in the mooc area (chapter 1) for instructions.
This is my Open Data Science course book, containing all the exercise solutions and also some random projects for extra practice.
# Read the learning2014 analysis dataset (produced in the data wrangling step)
analysis_data <- read.csv("~/Documents/GitHub/IODS-project/data/learning2014.csv",
sep=",", header=TRUE)
# Inspect the structure: 166 observations of 7 variables (echoed below)
str(analysis_data)
## 'data.frame': 166 obs. of 7 variables:
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 1 2 1 2 1 ...
## $ age : int 53 55 49 53 49 38 50 37 37 42 ...
## $ attitude: int 37 31 25 35 37 38 35 29 38 21 ...
## $ deep : num 0.97 0.789 0.947 0.947 0.992 ...
## $ stra : num 0.314 0.256 0.307 0.307 0.322 ...
## $ surf : num 0.925 1.134 0.806 0.806 1.015 ...
## $ points : int 25 12 24 10 22 21 21 31 24 26 ...
# Dimensions of the data: 166 rows, 7 columns
dim(analysis_data)
## [1] 166 7
# Gender distribution: 110 female, 56 male
summary(analysis_data$gender)
## F M
## 110 56
# Five-number summaries (plus mean) of the remaining variables
summary(analysis_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 21.00 22.00 25.51 27.00 55.00
summary(analysis_data$attitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 26.00 32.00 31.43 37.00 50.00
summary(analysis_data$deep)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4284 0.9019 0.9921 0.9956 1.1049 1.3303
summary(analysis_data$stra)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1389 0.2923 0.3216 0.3227 0.3581 0.4312
summary(analysis_data$surf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5670 0.8655 1.0147 0.9981 1.1341 1.5519
# Exam points: the outcome variable modeled below
summary(analysis_data$points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 19.00 23.00 22.72 27.75 33.00
# GGally supplies ggpairs(); attaching it auto-loads ggplot2 (message below)
library(GGally)
## Loading required package: ggplot2
library(ggplot2)
# Pairwise scatterplot/correlation matrix of all variables, colored by gender
ggpairs(analysis_data, mapping = aes( col = gender, alpha = 0.2 ), lower = list(combo = wrap("facethist", bins = 20)))
# Linear regression: exam points explained by attitude, age and strategic learning
my_model <- lm(points ~ attitude + age + stra, data = analysis_data)
summary(my_model)
##
## Call:
## lm(formula = points ~ attitude + age + stra, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.206 -3.430 0.204 3.979 10.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.60773 3.38966 4.605 8.32e-06 ***
## attitude 0.35941 0.05696 6.310 2.56e-09 ***
## age -0.07716 0.05322 -1.450 0.149
## stra -6.87318 8.55576 -0.803 0.423
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.307 on 162 degrees of freedom
## Multiple R-squared: 0.2043, Adjusted R-squared: 0.1896
## F-statistic: 13.87 on 3 and 162 DF, p-value: 4.305e-08
# Reduced model: attitude only — the single significant predictor in the
# three-variable model above (age and stra had p = 0.149 and 0.423)
my_model2 <- lm(points ~ attitude, data = analysis_data)
summary(my_model2)
##
## Call:
## lm(formula = points ~ attitude, data = analysis_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9763 -3.2119 0.4339 4.1534 10.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.63715 1.83035 6.358 1.95e-09 ***
## attitude 0.35255 0.05674 6.214 4.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.32 on 164 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1856
## F-statistic: 38.61 on 1 and 164 DF, p-value: 4.119e-09
# 2x2 plotting grid for the diagnostic plots (only three are drawn below)
par(mfrow = c(2,2))
# Residuals vs fitted (1), normal Q-Q (2), residuals vs leverage (5)
plot(my_model, which = c(1, 2, 5))
# Drop rows 2, 4 and 56 — presumably flagged by the diagnostic plots above
# as outliers/influential points; TODO confirm against the plots
analysis_data_new <- analysis_data[-c(2, 4, 56), ]
# Refit the three-predictor model without those observations
my_model3 <- lm(points ~ attitude + age + stra, data = analysis_data_new)
summary(my_model3)
##
## Call:
## lm(formula = points ~ attitude + age + stra, data = analysis_data_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.8988 -3.4558 0.0807 3.8415 11.5969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.30158 3.21710 4.445 1.64e-05 ***
## attitude 0.38062 0.05399 7.049 5.18e-11 ***
## age 0.04040 0.05658 0.714 0.476
## stra -13.31399 8.20912 -1.622 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.01 on 159 degrees of freedom
## Multiple R-squared: 0.2415, Adjusted R-squared: 0.2271
## F-statistic: 16.87 on 3 and 159 DF, p-value: 1.455e-09
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("stringr")
library("tidyr")
library("GGally")
library("ggplot2")
library("MASS")
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Read the student alcohol consumption dataset (created in the wrangling step)
analysis.data <- read.csv("~/Documents/GitHub/IODS-project/data/alc.csv",
sep=",", header=TRUE)
# Quick overview: 382 observations of 35 variables (echoed below)
glimpse(analysis.data)
## Observations: 382
## Variables: 35
## $ school <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP,...
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, ...
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15,...
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, ...
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, G...
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, ...
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, ...
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, ...
## $ Mjob <fct> at_home, at_home, at_home, health, other, services,...
## $ Fjob <fct> teacher, other, other, services, other, other, othe...
## $ reason <fct> course, course, other, home, home, reputation, home...
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes,...
## $ guardian <fct> mother, father, mother, mother, father, mother, mot...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, ...
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, ...
## $ failures <int> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, ...
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes,...
## $ paid <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, ...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes,...
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no...
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, ...
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, ...
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, ...
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, ...
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, ...
## $ absences <int> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, ...
## $ G1 <int> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11,...
## $ G2 <int> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11...
## $ G3 <int> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 1...
## $ alc_use <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1...
## $ high_use <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FAL...
# Stack every column into key/value pairs to inspect all variables at once
# NOTE(review): tidyr::gather() is superseded by pivot_longer(); kept here
# because the echoed output below uses gather()'s key/value column names
gather(analysis.data) %>% glimpse
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Observations: 13,370
## Variables: 2
## $ key <chr> "school", "school", "school", "school", "school", "schoo...
## $ value <chr> "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "GP", "G...
# Bar plot of every variable's distribution, one facet per variable
gather(analysis.data) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
library("dplyr")
# Keep only the variables chosen for the logistic regression analysis
chosen.data <- analysis.data[, c("age", "sex", "Pstatus", "famrel", "high_use")]
# Draw a bar plot of chosen variables
gather(chosen.data) %>% ggplot(aes(value)) + facet_wrap("key", scales = "free") + geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Draw a correlation plot of chosen variables
ggpairs(chosen.data, mapping = aes( col = sex, alpha = 0.2 ), lower = list(combo = wrap("facethist", bins = 20)))
# Get cross tabulation tables for variables
library(gmodels)
# Cross-tabulate sex against high alcohol use
CrossTable(chosen.data$sex, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$sex | FALSE | TRUE | Row Total |
## ----------------|-----------|-----------|-----------|
## F | 156 | 42 | 198 |
## | 2.102 | 4.942 | |
## | 0.788 | 0.212 | 0.518 |
## | 0.582 | 0.368 | |
## | 0.408 | 0.110 | |
## ----------------|-----------|-----------|-----------|
## M | 112 | 72 | 184 |
## | 2.262 | 5.318 | |
## | 0.609 | 0.391 | 0.482 |
## | 0.418 | 0.632 | |
## | 0.293 | 0.188 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## ----------------|-----------|-----------|-----------|
##
##
# Cross-tabulate age against high alcohol use
CrossTable(chosen.data$age, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$age | FALSE | TRUE | Row Total |
## ----------------|-----------|-----------|-----------|
## 15 | 63 | 18 | 81 |
## | 0.671 | 1.576 | |
## | 0.778 | 0.222 | 0.212 |
## | 0.235 | 0.158 | |
## | 0.165 | 0.047 | |
## ----------------|-----------|-----------|-----------|
## 16 | 79 | 28 | 107 |
## | 0.206 | 0.484 | |
## | 0.738 | 0.262 | 0.280 |
## | 0.295 | 0.246 | |
## | 0.207 | 0.073 | |
## ----------------|-----------|-----------|-----------|
## 17 | 64 | 36 | 100 |
## | 0.540 | 1.270 | |
## | 0.640 | 0.360 | 0.262 |
## | 0.239 | 0.316 | |
## | 0.168 | 0.094 | |
## ----------------|-----------|-----------|-----------|
## 18 | 54 | 27 | 81 |
## | 0.141 | 0.331 | |
## | 0.667 | 0.333 | 0.212 |
## | 0.201 | 0.237 | |
## | 0.141 | 0.071 | |
## ----------------|-----------|-----------|-----------|
## 19 | 7 | 4 | 11 |
## | 0.067 | 0.157 | |
## | 0.636 | 0.364 | 0.029 |
## | 0.026 | 0.035 | |
## | 0.018 | 0.010 | |
## ----------------|-----------|-----------|-----------|
## 20 | 1 | 0 | 1 |
## | 0.127 | 0.298 | |
## | 1.000 | 0.000 | 0.003 |
## | 0.004 | 0.000 | |
## | 0.003 | 0.000 | |
## ----------------|-----------|-----------|-----------|
## 22 | 0 | 1 | 1 |
## | 0.702 | 1.649 | |
## | 0.000 | 1.000 | 0.003 |
## | 0.000 | 0.009 | |
## | 0.000 | 0.003 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## ----------------|-----------|-----------|-----------|
##
##
# Cross-tabulate parental cohabitation status against high alcohol use
CrossTable(chosen.data$Pstatus, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$Pstatus | FALSE | TRUE | Row Total |
## --------------------|-----------|-----------|-----------|
## A | 26 | 12 | 38 |
## | 0.016 | 0.038 | |
## | 0.684 | 0.316 | 0.099 |
## | 0.097 | 0.105 | |
## | 0.068 | 0.031 | |
## --------------------|-----------|-----------|-----------|
## T | 242 | 102 | 344 |
## | 0.002 | 0.004 | |
## | 0.703 | 0.297 | 0.901 |
## | 0.903 | 0.895 | |
## | 0.634 | 0.267 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## --------------------|-----------|-----------|-----------|
##
##
# Cross-tabulate family relationship quality against high alcohol use
CrossTable(chosen.data$famrel, chosen.data$high_use)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 382
##
##
## | chosen.data$high_use
## chosen.data$famrel | FALSE | TRUE | Row Total |
## -------------------|-----------|-----------|-----------|
## 1 | 6 | 2 | 8 |
## | 0.027 | 0.063 | |
## | 0.750 | 0.250 | 0.021 |
## | 0.022 | 0.018 | |
## | 0.016 | 0.005 | |
## -------------------|-----------|-----------|-----------|
## 2 | 10 | 9 | 19 |
## | 0.832 | 1.955 | |
## | 0.526 | 0.474 | 0.050 |
## | 0.037 | 0.079 | |
## | 0.026 | 0.024 | |
## -------------------|-----------|-----------|-----------|
## 3 | 39 | 25 | 64 |
## | 0.775 | 1.823 | |
## | 0.609 | 0.391 | 0.168 |
## | 0.146 | 0.219 | |
## | 0.102 | 0.065 | |
## -------------------|-----------|-----------|-----------|
## 4 | 135 | 54 | 189 |
## | 0.044 | 0.102 | |
## | 0.714 | 0.286 | 0.495 |
## | 0.504 | 0.474 | |
## | 0.353 | 0.141 | |
## -------------------|-----------|-----------|-----------|
## 5 | 78 | 24 | 102 |
## | 0.580 | 1.362 | |
## | 0.765 | 0.235 | 0.267 |
## | 0.291 | 0.211 | |
## | 0.204 | 0.063 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 268 | 114 | 382 |
## | 0.702 | 0.298 | |
## -------------------|-----------|-----------|-----------|
##
##
# Model with glm()
# Logistic regression: high alcohol use predicted by parental status, age,
# sex and family relationship quality
chosen.mod <- glm(high_use ~ Pstatus + age + sex + famrel, data = chosen.data, family = "binomial")
summary(chosen.mod)
##
## Call:
## glm(formula = high_use ~ Pstatus + age + sex + famrel, family = "binomial",
## data = chosen.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5439 -0.8479 -0.6640 1.2254 2.0504
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.88778 1.71500 -2.267 0.0234 *
## PstatusT -0.13249 0.38409 -0.345 0.7301
## age 0.23479 0.09865 2.380 0.0173 *
## sexM 0.94515 0.23601 4.005 6.21e-05 ***
## famrel -0.32119 0.12560 -2.557 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 438.92 on 377 degrees of freedom
## AIC: 448.92
##
## Number of Fisher Scoring iterations: 4
# Compute odds ratios (OR) by exponentiating the log-odds coefficients
chosen.mod.OR <- coef(chosen.mod) %>% exp
# Compute confidence intervals (CI) — profile-likelihood, on the log-odds scale
confint(chosen.mod)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -7.29460682 -0.54975516
## PstatusT -0.86780384 0.64983706
## age 0.04291218 0.43089580
## sexM 0.48770904 1.41454481
## famrel -0.56968043 -0.07549712
# Exponentiate the CIs so they are on the odds-ratio scale
chosen.mod.CI <- confint(chosen.mod) %>% exp
## Waiting for profiling to be done...
# print out the odds ratios with their confidence intervals
cbind(chosen.mod.OR, chosen.mod.CI)
## chosen.mod.OR 2.5 % 97.5 %
## (Intercept) 0.02049085 0.0006791919 0.5770911
## PstatusT 0.87590845 0.4198726450 1.9152287
## age 1.26464541 1.0438462211 1.5386352
## sexM 2.57320663 1.6285809238 4.1146131
## famrel 0.72528788 0.5657061903 0.9272824
# predict() the probability of high_use (fitted probabilities on training data)
probabilities <- predict(chosen.mod, type = "response")
# add the predicted probabilities to 'alc'
# (NOTE(review): the data frame mutated here is chosen.data, not alc)
chosen.data <- mutate(chosen.data, probability = probabilities)
# use the probabilities to make a prediction of high_use (cutoff 0.5)
chosen.data <- mutate(chosen.data, prediction = probabilities > 0.5)
# tabulate the target variable versus the predictions
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 98 16
# initialize a plot of 'high_use' versus 'probability', colored by prediction
g <- ggplot(chosen.data, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g + geom_point()
# tabulate the target variable versus the predictions (confusion matrix)
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 259 9
## TRUE 98 16
# Adjust the code: Use %>% to apply the prop.table() function on the output of table()
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction) %>% prop.table()
## prediction
## high_use FALSE TRUE
## FALSE 0.67801047 0.02356021
## TRUE 0.25654450 0.04188482
# Adjust the code: Use %>% to apply the addmargins() function on the output of prop.table()
table(high_use = chosen.data$high_use, prediction = chosen.data$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.67801047 0.02356021 0.70157068
## TRUE 0.25654450 0.04188482 0.29842932
## Sum 0.93455497 0.06544503 1.00000000
# Mean misclassification rate: a prediction counts as wrong when the
# predicted probability lands on the wrong side of 0.5 from the true class.
loss_func <- function(class, prob) {
  mean(abs(class - prob) > 0.5)
}
# call loss_func to compute the average number of wrong predictions in the (training) data using the above function
# (training misclassification rate of chosen.mod; result echoed below)
loss_func(class = chosen.data$high_use, prob = chosen.data$probability)
## [1] 0.2801047
# Calculate misclassification error using an R package to confirm the above results
library(InformationValue)
# NOTE(review): optimalCutoff()/misClassError() expect predicted
# probabilities; here the logical `prediction` column is passed instead of
# `probability` — confirm this is intended
optCutOff <- optimalCutoff(chosen.data$high_use, chosen.data$prediction)[1]
misClassError(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.2801
# Calculating concordance of actuals vs predictions
Concordance(chosen.data$high_use, chosen.data$prediction)
## $Concordance
## [1] 0.1356376
##
## $Discordance
## [1] 0.8643624
##
## $Tied
## [1] -1.110223e-16
##
## $Pairs
## [1] 30552
# Calculating sensitivity/specificity of our model at the optimal cutoff
sensitivity(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.1403509
specificity(chosen.data$high_use, chosen.data$prediction, threshold = optCutOff)
## [1] 0.9664179
# Construct an ROC curve
# NOTE(review): the plotROC() called below is InformationValue::plotROC();
# the attached 'plotROC' package provides geom_roc() and appears unused —
# confirm before removing this library() call
library(plotROC)
plotROC(chosen.data$high_use, chosen.data$prediction)
## K-fold cross-validation
library(boot)
K <- nrow(chosen.data) # K for leave-one-out CV; NOTE(review): unused — cv.glm below hard-codes K = 10 (10-fold), not this value
cv_chosen <- cv.glm(data = chosen.data, cost = loss_func, glmfit = chosen.mod, K = 10)
## average number of wrong predictions in the cross validation
cv_chosen$delta[1]
## [1] 0.2905759
What we see above is that the average prediction error from the cross-validation of the above model is about 0.29, which is higher than the 0.26 error of the model explored in DataCamp. Hence, the model explored in DataCamp has better predictive power for alcohol use than the one explored here.
## Define big model to compare (upper bound of the stepwise search scope).
big.model <- glm(high_use ~ school + sex + age + address + famsize + Pstatus + G3 + failures + famrel + famsup + freetime + goout + schoolsup + absences + health + Medu + Fedu +
activities + paid, data = analysis.data, family = binomial(link = "logit"))
## Define null model to compare (intercept only; lower bound of the search).
null.model <- glm(high_use ~ 1, data = analysis.data, family = binomial(link = "logit"))
## Determining model with step procedure: bidirectional AIC-based selection
## starting from the null model, with chi-squared tests reported at each step
step.model <- step(null.model, scope = list(upper = big.model), direction = "both",
test = "Chisq", data = analysis.data)
## Start: AIC=467.68
## high_use ~ 1
##
## Df Deviance AIC LRT Pr(>Chi)
## + goout 1 415.68 419.68 50.001 1.537e-12 ***
## + absences 1 447.45 451.45 18.233 1.955e-05 ***
## + sex 1 450.95 454.95 14.732 0.0001239 ***
## + freetime 1 456.84 460.84 8.840 0.0029469 **
## + failures 1 456.98 460.98 8.698 0.0031864 **
## + G3 1 459.43 463.43 6.252 0.0124059 *
## + age 1 460.83 464.83 4.851 0.0276320 *
## + famrel 1 460.92 464.92 4.756 0.0292035 *
## + address 1 462.30 466.30 3.375 0.0661994 .
## + famsize 1 463.48 467.48 2.200 0.1380308
## <none> 465.68 467.68
## + health 1 464.29 468.29 1.389 0.2385700
## + activities 1 464.43 468.43 1.245 0.2645257
## + schoolsup 1 464.51 468.51 1.165 0.2803902
## + paid 1 464.80 468.80 0.877 0.3491564
## + school 1 465.13 469.13 0.553 0.4572172
## + famsup 1 465.19 469.19 0.485 0.4860956
## + Fedu 1 465.46 469.46 0.215 0.6427752
## + Pstatus 1 465.62 469.62 0.060 0.8062405
## + Medu 1 465.63 469.63 0.046 0.8298479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=419.68
## high_use ~ goout
##
## Df Deviance AIC LRT Pr(>Chi)
## + absences 1 402.50 408.50 13.175 0.0002837 ***
## + sex 1 402.74 408.74 12.935 0.0003224 ***
## + famrel 1 408.12 414.12 7.562 0.0059609 **
## + address 1 409.72 415.72 5.960 0.0146306 *
## + failures 1 410.92 416.92 4.753 0.0292404 *
## + G3 1 413.39 419.39 2.293 0.1299925
## + health 1 413.42 419.42 2.263 0.1324990
## + famsize 1 413.51 419.51 2.166 0.1411078
## <none> 415.68 419.68
## + activities 1 413.68 419.68 2.000 0.1573155
## + age 1 414.38 420.38 1.299 0.2543646
## + paid 1 414.53 420.53 1.147 0.2840893
## + freetime 1 414.75 420.75 0.931 0.3345969
## + schoolsup 1 414.77 420.77 0.910 0.3402438
## + school 1 414.79 420.79 0.886 0.3464949
## + famsup 1 415.36 421.36 0.314 0.5753984
## + Pstatus 1 415.54 421.54 0.142 0.7065896
## + Fedu 1 415.57 421.57 0.111 0.7387151
## + Medu 1 415.64 421.64 0.035 0.8522261
## - goout 1 465.68 467.68 50.001 1.537e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=408.5
## high_use ~ goout + absences
##
## Df Deviance AIC LRT Pr(>Chi)
## + sex 1 387.75 395.75 14.748 0.0001229 ***
## + famrel 1 395.79 403.79 6.716 0.0095572 **
## + address 1 397.09 405.09 5.413 0.0199833 *
## + failures 1 398.40 406.40 4.107 0.0427168 *
## + health 1 400.08 408.08 2.427 0.1192936
## + activities 1 400.24 408.24 2.262 0.1325869
## <none> 402.50 408.50
## + famsize 1 400.54 408.54 1.962 0.1613260
## + G3 1 400.92 408.92 1.583 0.2083813
## + paid 1 400.92 408.92 1.582 0.2084805
## + school 1 400.98 408.98 1.526 0.2166950
## + freetime 1 401.21 409.21 1.288 0.2563846
## + schoolsup 1 401.50 409.50 1.003 0.3164808
## + age 1 401.95 409.95 0.550 0.4581523
## + famsup 1 402.06 410.06 0.446 0.5044225
## + Medu 1 402.25 410.25 0.254 0.6144158
## + Fedu 1 402.47 410.47 0.035 0.8512142
## + Pstatus 1 402.50 410.50 0.006 0.9380858
## - absences 1 415.68 419.68 13.175 0.0002837 ***
## - goout 1 447.45 451.45 44.943 2.028e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=395.75
## high_use ~ goout + absences + sex
##
## Df Deviance AIC LRT Pr(>Chi)
## + famrel 1 379.81 389.81 7.946 0.0048195 **
## + address 1 382.70 392.70 5.058 0.0245093 *
## + activities 1 383.70 393.70 4.051 0.0441470 *
## + paid 1 384.50 394.50 3.255 0.0711919 .
## + failures 1 385.06 395.06 2.693 0.1008032
## <none> 387.75 395.75
## + school 1 385.95 395.95 1.803 0.1793763
## + G3 1 386.44 396.44 1.312 0.2520448
## + famsize 1 386.58 396.58 1.176 0.2781516
## + health 1 386.77 396.77 0.985 0.3209508
## + Medu 1 387.02 397.02 0.731 0.3926563
## + age 1 387.09 397.09 0.663 0.4156185
## + freetime 1 387.47 397.47 0.280 0.5964214
## + schoolsup 1 387.61 397.61 0.143 0.7050365
## + famsup 1 387.74 397.74 0.013 0.9095974
## + Fedu 1 387.75 397.75 0.000 0.9944014
## + Pstatus 1 387.75 397.75 0.000 0.9949057
## - sex 1 402.50 408.50 14.748 0.0001229 ***
## - absences 1 402.74 408.74 14.988 0.0001082 ***
## - goout 1 430.07 436.07 42.314 7.773e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=389.81
## high_use ~ goout + absences + sex + famrel
##
## Df Deviance AIC LRT Pr(>Chi)
## + address 1 374.76 386.76 5.044 0.0247052 *
## + activities 1 376.23 388.23 3.581 0.0584394 .
## + paid 1 376.64 388.64 3.168 0.0750934 .
## + failures 1 377.79 389.79 2.020 0.1552693
## <none> 379.81 389.81
## + health 1 377.91 389.91 1.904 0.1676831
## + school 1 378.55 390.55 1.261 0.2615349
## + G3 1 378.81 390.81 1.000 0.3172697
## + famsize 1 378.89 390.89 0.921 0.3372367
## + age 1 378.93 390.93 0.883 0.3473595
## + Medu 1 378.97 390.97 0.835 0.3607934
## + freetime 1 379.07 391.07 0.738 0.3902071
## + schoolsup 1 379.69 391.69 0.122 0.7271839
## + famsup 1 379.77 391.77 0.042 0.8367076
## + Pstatus 1 379.80 391.80 0.013 0.9084113
## + Fedu 1 379.80 391.80 0.005 0.9461156
## - famrel 1 387.75 395.75 7.946 0.0048195 **
## - absences 1 393.80 401.80 13.993 0.0001835 ***
## - sex 1 395.79 403.79 15.979 6.406e-05 ***
## - goout 1 424.86 432.86 45.048 1.923e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=386.76
## high_use ~ goout + absences + sex + famrel + address
##
## Df Deviance AIC LRT Pr(>Chi)
## + activities 1 370.67 384.67 4.094 0.0430315 *
## + paid 1 370.92 384.92 3.844 0.0499390 *
## <none> 374.76 386.76
## + health 1 373.08 387.08 1.685 0.1942380
## + failures 1 373.09 387.09 1.670 0.1962137
## + famsize 1 373.41 387.41 1.353 0.2447313
## + freetime 1 373.99 387.99 0.773 0.3793315
## + G3 1 374.37 388.37 0.392 0.5311653
## + Medu 1 374.46 388.46 0.301 0.5835271
## + age 1 374.47 388.47 0.292 0.5889660
## + school 1 374.49 388.49 0.275 0.6000857
## + schoolsup 1 374.70 388.70 0.063 0.8020037
## + famsup 1 374.73 388.73 0.032 0.8586738
## + Fedu 1 374.74 388.74 0.025 0.8739745
## + Pstatus 1 374.76 388.76 0.000 0.9993591
## - address 1 379.81 389.81 5.044 0.0247052 *
## - famrel 1 382.70 392.70 7.932 0.0048564 **
## - absences 1 388.50 398.50 13.738 0.0002101 ***
## - sex 1 390.30 400.30 15.539 8.084e-05 ***
## - goout 1 422.01 432.01 47.242 6.273e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=384.67
## high_use ~ goout + absences + sex + famrel + address + activities
##
## Df Deviance AIC LRT Pr(>Chi)
## + paid 1 366.49 382.49 4.185 0.0407904 *
## <none> 370.67 384.67
## + health 1 369.12 385.12 1.547 0.2135619
## + failures 1 369.46 385.46 1.214 0.2704852
## + famsize 1 369.53 385.53 1.143 0.2850968
## + freetime 1 369.78 385.78 0.888 0.3460988
## + age 1 370.48 386.48 0.191 0.6621013
## + Fedu 1 370.52 386.52 0.153 0.6960310
## + G3 1 370.54 386.54 0.134 0.7142250
## + school 1 370.55 386.55 0.124 0.7246465
## + Medu 1 370.56 386.56 0.108 0.7421703
## + Pstatus 1 370.61 386.61 0.063 0.8024291
## + famsup 1 370.65 386.65 0.023 0.8805015
## + schoolsup 1 370.65 386.65 0.019 0.8899358
## - activities 1 374.76 386.76 4.094 0.0430315 *
## - address 1 376.23 388.23 5.557 0.0184020 *
## - famrel 1 378.14 390.14 7.466 0.0062860 **
## - absences 1 384.72 396.72 14.051 0.0001779 ***
## - sex 1 387.95 399.95 17.275 3.234e-05 ***
## - goout 1 419.34 431.34 48.665 3.037e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=382.49
## high_use ~ goout + absences + sex + famrel + address + activities +
## paid
##
## Df Deviance AIC LRT Pr(>Chi)
## + failures 1 364.29 382.29 2.198 0.1381945
## <none> 366.49 382.49
## + health 1 364.50 382.50 1.983 0.1590665
## + famsize 1 365.31 383.31 1.180 0.2772746
## + freetime 1 365.43 383.43 1.051 0.3052950
## + famsup 1 366.10 384.10 0.381 0.5368105
## + G3 1 366.11 384.11 0.372 0.5416643
## + Medu 1 366.12 384.12 0.367 0.5444822
## + age 1 366.33 384.33 0.156 0.6931483
## + school 1 366.42 384.42 0.068 0.7936917
## + Fedu 1 366.42 384.42 0.061 0.8047582
## + Pstatus 1 366.47 384.47 0.011 0.9173748
## + schoolsup 1 366.48 384.48 0.003 0.9594304
## - paid 1 370.67 384.67 4.185 0.0407904 *
## - activities 1 370.92 384.92 4.435 0.0352018 *
## - address 1 372.90 386.90 6.409 0.0113517 *
## - famrel 1 374.01 388.01 7.523 0.0060923 **
## - absences 1 381.53 395.53 15.046 0.0001049 ***
## - sex 1 386.10 400.10 19.612 9.487e-06 ***
## - goout 1 415.98 429.98 49.499 1.985e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=382.29
## high_use ~ goout + absences + sex + famrel + address + activities +
## paid + failures
##
## Df Deviance AIC LRT Pr(>Chi)
## <none> 364.29 382.29
## - failures 1 366.49 382.49 2.198 0.1381945
## + health 1 362.70 382.70 1.585 0.2080158
## + famsize 1 363.01 383.01 1.278 0.2582321
## + freetime 1 363.30 383.30 0.983 0.3214915
## + Fedu 1 363.80 383.80 0.484 0.4866195
## + famsup 1 363.87 383.87 0.415 0.5195099
## - activities 1 368.10 384.10 3.817 0.0507355 .
## + school 1 364.20 384.20 0.087 0.7678936
## + Medu 1 364.24 384.24 0.049 0.8248779
## + age 1 364.25 384.25 0.034 0.8532051
## + G3 1 364.27 384.27 0.020 0.8881974
## + schoolsup 1 364.28 384.28 0.005 0.9437839
## + Pstatus 1 364.29 384.29 0.000 0.9839522
## - paid 1 369.46 385.46 5.168 0.0230019 *
## - address 1 370.26 386.26 5.975 0.0145064 *
## - famrel 1 371.11 387.11 6.826 0.0089854 **
## - absences 1 378.63 394.63 14.345 0.0001522 ***
## - sex 1 382.40 398.40 18.113 2.081e-05 ***
## - goout 1 409.81 425.81 45.518 1.512e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Collect the coefficients of the model selected by the stepwise procedure
final.model.coeff <- as.data.frame(step.model$coefficients)
# Refit the stepwise-selected model explicitly with the chosen predictors
final.mod <- glm(high_use ~ sex + address + failures + famrel + goout + absences +
activities + paid, data = analysis.data, family = binomial(link = "logit"))
summary(final.mod)
##
## Call:
## glm(formula = high_use ~ sex + address + failures + famrel +
## goout + absences + activities + paid, family = binomial(link = "logit"),
## data = analysis.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8829 -0.7374 -0.4707 0.6885 2.8657
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.49941 0.73483 -3.401 0.000671 ***
## sexM 1.13462 0.27425 4.137 3.52e-05 ***
## addressU -0.77509 0.31595 -2.453 0.014157 *
## failures 0.32272 0.21815 1.479 0.139051
## famrel -0.37983 0.14566 -2.608 0.009119 **
## goout 0.80068 0.12880 6.217 5.08e-10 ***
## absences 0.08401 0.02218 3.788 0.000152 ***
## activitiesyes -0.51728 0.26676 -1.939 0.052487 .
## paidyes 0.61261 0.27244 2.249 0.024536 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 364.29 on 373 degrees of freedom
## AIC: 382.29
##
## Number of Fisher Scoring iterations: 5
# Get final model data: only the predictors of final.mod plus the target
final.data <- analysis.data[, c("goout", "absences", "sex", "famrel", "address",
"activities", "paid", "failures", "high_use")]
# predict() the probability of high_use (fitted probabilities, training data)
probabilities.final.mod <- predict(final.mod, type = "response")
# add the predicted probabilities to 'alc'
# (NOTE(review): the data frame mutated here is final.data, not alc)
final.data <- mutate(final.data, probability = probabilities.final.mod)
# use the probabilities to make a prediction of high_use (cutoff 0.5)
final.data <- mutate(final.data, prediction = probabilities.final.mod > 0.5)
# tabulate the target variable versus the predictions
table(high_use = final.data$high_use, prediction = final.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 252 16
## TRUE 59 55
# initialize a plot of 'high_use' versus 'probability', colored by prediction
g_final <- ggplot(final.data, aes(x = probability, y = high_use, col = prediction))
# define the geom as points and draw the plot
g_final + geom_point()
# tabulate the target variable versus the predictions (confusion matrix)
table(high_use = final.data$high_use, prediction = final.data$prediction)
## prediction
## high_use FALSE TRUE
## FALSE 252 16
## TRUE 59 55
# Adjust the code: Use %>% to apply the prop.table() function on the output of table()
table(high_use = final.data$high_use, prediction = final.data$prediction) %>% prop.table()
## prediction
## high_use FALSE TRUE
## FALSE 0.65968586 0.04188482
## TRUE 0.15445026 0.14397906
# Adjust the code: Use %>% to apply the addmargins() function on the output of prop.table()
table(high_use = final.data$high_use, prediction = final.data$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.65968586 0.04188482 0.70157068
## TRUE 0.15445026 0.14397906 0.29842932
## Sum 0.81413613 0.18586387 1.00000000
# Loss function (mean prediction error): the proportion of observations
# whose true class (0/1 or FALSE/TRUE) lies more than 0.5 away from the
# predicted probability, i.e. the misclassification rate at cutoff 0.5.
loss_func <- function(class, prob) {
  mean(abs(class - prob) > 0.5)
}
# call loss_func to compute the average number of wrong predictions in the (training) data using the above function
# (this is the training error, so it is optimistic compared to the
# cross-validated error computed further below)
loss_func(class = final.data$high_use, prob = final.data$probability)
## [1] 0.1963351
# Calculate misclassification error using an R package to confirm the above results
library(InformationValue)
# NOTE(review): per the InformationValue docs, optimalCutoff() and
# misClassError() expect predicted probability *scores*; the binary
# 'prediction' column is passed here, which collapses the score to 0/1.
# Consider passing final.data$probability instead — kept as-is so the
# printed results below still correspond to the code.
optCutOff_final <- optimalCutoff(final.data$high_use, final.data$prediction)[1]
misClassError(final.data$high_use, final.data$prediction, threshold = optCutOff_final)
## [1] 0.1963
# Calculating concordance
Concordance(final.data$high_use, final.data$prediction)
## $Concordance
## [1] 0.4536528
##
## $Discordance
## [1] 0.5463472
##
## $Tied
## [1] 0
##
## $Pairs
## [1] 30552
# Calculating sensitivity/specificity of our model
# Fixed: the original passed 'optCutOff' here — the cutoff belonging to
# the earlier, initially chosen model — instead of this model's
# 'optCutOff_final' computed above.
sensitivity(final.data$high_use, final.data$prediction, threshold = optCutOff_final)
## [1] 0.4824561
specificity(final.data$high_use, final.data$prediction, threshold = optCutOff_final)
## [1] 0.9402985
# Construct an ROC curve
library(plotROC)
# NOTE(review): an ROC curve should be built from the predicted
# probabilities, not the binary TRUE/FALSE predictions — with binary
# input the curve has only a single operating point. Consider passing
# final.data$probability (and chosen.data$probability) instead; verify
# that 'chosen.data' (built in an earlier section) carries that column.
# ROC for Final chosen model
plotROC(final.data$high_use, final.data$prediction)
# ROC for initially chosen model
plotROC(chosen.data$high_use, chosen.data$prediction)
## K-fold cross-validation
library(boot)
# One fold per observation = leave-one-out cross-validation.
K_final <- nrow(final.data) #defines the leave-one-out method
# Fixed: K_final was defined for leave-one-out CV but a hard-coded
# K = 10 was passed to cv.glm(); use K_final so the code matches the
# stated intent. (The value printed below was produced by the original
# K = 10 run and may differ slightly under LOOCV.)
cv_final <- cv.glm(data = final.data, cost = loss_func, glmfit = final.mod, K = K_final)
## average number of wrong predictions in the cross validation
cv_final$delta[1]
## [1] 0.1963351
# --- Chapter 4: clustering and classification (Boston data) ---
# Attach all packages used in this chapter up front.
library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
library("stringr")
library("tidyr")
library("GGally")
# NOTE(review): ggplot2 is attached a second time here — a harmless
# no-op that could be removed.
library("ggplot2")
library("MASS")
library("corrplot")
## corrplot 0.84 loaded
# plotly masks MASS::select / dplyr-style verbs, hence the explicit
# dplyr::select() calls later in this chapter.
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the Boston housing data shipped with MASS and take a first look.
data("Boston")
glimpse(Boston)
## Observations: 506
## Variables: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, ...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5,...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, ...
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0,...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605...
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, ...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, ...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.9...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, ...
# Fixed: 'Boston$med' relied on $'s partial matching to reach the 'medv'
# column (Boston has no column named 'med'); spell the name out so a
# future column addition cannot silently change what is plotted.
plot(Boston$medv)
# General summary of the dataset
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Matrix of the variables (pairwise scatterplots of all 14 columns;
# dense, but gives a first impression of the relationships)
pairs(Boston)
# Correlation matrix, rounded to 2 decimals for readability
cor_matrix <- cor(Boston) %>% round(2)
# Upper-triangle circle plot; tl.cex shrinks the variable labels
corrplot(cor_matrix, method = "circle", type = "upper",
cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
# How do other variables stack up against crime rates? Do we see patterns?
# melt() (from data.table here) stacks the 13 non-crim columns into long
# format so each one can be plotted against crim in its own facet
molten <- melt(Boston, id = "crim")
ggplot(molten, aes(x = value, y = crim))+
facet_wrap( ~ variable, scales = "free")+
geom_point()
# Centering and standardizing variables: scale() subtracts each column's
# mean and divides by its standard deviation. Note that scale() returns
# a matrix, not a data frame (see the class() check below).
boston_scaled <- scale(Boston)
# Summaries of the scaled variables
glimpse(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
# After scaling every column has mean 0
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
# Class of boston_scaled object (a matrix — converted to a data frame
# in the next step so dplyr verbs can be used on it)
class(boston_scaled)
## [1] "matrix"
# Converting to data frame
boston_scaled <- as.data.frame(boston_scaled)
# Summary of the *original, unscaled* crime rate for comparison
# (the earlier comment said "scaled", but this summarizes Boston$crim,
# as the min/max below confirm)
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
# Quantile vector of 'crim': min, quartiles and max of the scaled values,
# used as cut points below
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
# Categorical variable 'crime' from 'crim'; cutting at the quartiles
# gives four classes of roughly equal size
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, label = c("low", "med_low", "med_high", "high"))
# Tabulation of the new factor crime
table(crime)
## crime
## low med_low med_high high
## 127 126 126 127
# Removing original 'crim' from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
# Adding the new categorical value to scaled data
boston_scaled <- data.frame(boston_scaled, crime)
# Number of rows in the Boston dataset
n <- nrow(boston_scaled)
n
## [1] 506
# Choosing randomly 80% of the rows for the training set.
# NOTE(review): no set.seed() has been called at this point (the only
# seed in this chapter is set much later), so this sample — and every
# result derived from the train/test split — is not reproducible.
ind <- sample(n, size = n * 0.8)
ind
## [1] 389 262 73 371 140 195 131 127 446 338 329 100 488 311 249 236 130
## [18] 25 164 26 431 419 204 229 428 407 441 244 479 55 336 227 95 291
## [35] 208 47 366 184 170 494 113 62 16 112 266 280 361 154 385 109 202
## [52] 132 268 122 222 121 22 278 111 396 15 189 493 400 424 13 350 212
## [69] 504 191 230 443 42 402 86 226 218 125 243 46 374 327 294 272 486
## [86] 155 322 421 223 83 412 352 257 115 85 367 418 405 153 340 465 343
## [103] 145 211 459 265 401 180 380 403 7 196 307 120 118 221 94 98 496
## [120] 185 96 78 162 499 341 482 372 451 82 289 44 365 193 256 285 362
## [137] 142 339 436 65 319 325 206 20 190 470 332 357 119 54 209 228 495
## [154] 284 168 133 247 2 252 415 287 254 106 192 323 176 330 490 342 276
## [171] 61 414 404 425 165 158 67 71 107 41 351 275 455 233 58 187 99
## [188] 475 215 473 379 333 24 108 259 458 321 205 124 447 146 251 64 214
## [205] 92 161 51 476 59 437 235 293 102 72 399 506 160 286 406 300 248
## [222] 123 203 45 453 3 422 167 318 36 76 375 182 497 18 368 397 364
## [239] 258 159 483 438 271 492 468 12 37 88 198 43 461 14 70 17 347
## [256] 290 91 1 296 427 363 93 152 126 255 449 477 335 312 304 103 32
## [273] 101 439 384 10 263 317 242 19 87 177 358 246 316 313 423 75 464
## [290] 377 320 66 150 388 240 116 466 489 474 9 416 4 213 139 34 450
## [307] 273 429 373 250 430 186 505 344 398 28 40 217 378 149 448 387 359
## [324] 210 104 178 68 409 472 331 315 234 269 355 381 480 74 144 282 386
## [341] 60 105 426 487 81 136 114 394 411 503 238 292 49 444 337 173 84
## [358] 469 454 30 346 5 478 38 156 360 11 467 169 128 69 27 356 237
## [375] 445 194 241 270 456 442 135 408 225 376 23 345 354 348 303 261 239
## [392] 134 392 298 77 277 79 52 501 324 283 89 175 471
# Training set (the 80% of rows sampled above)
train <- boston_scaled[ind,]
# Test set (the remaining 20%)
test <- boston_scaled[-ind,]
# Saving correct classes from test data before removing them, so the
# predictions can be evaluated later
correct_classes <- test$crime
# Removing 'crime' variable from test data
test <- dplyr::select(test, -crime)
# Linear discriminant analysis: predict the crime class from all the
# other (scaled) variables in the training data
lda.fit <- lda(crime ~., data = train)
# lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2475248 0.2599010 0.2351485 0.2574257
##
## Group means:
## zn indus chas nox rm
## low 0.87711030 -0.8799953 -0.07547406 -0.8585438 0.3782652
## med_low -0.06010224 -0.2811903 -0.08484810 -0.5745863 -0.1374265
## med_high -0.37711365 0.1324407 0.26643202 0.3248407 0.1823968
## high -0.48724019 1.0170690 -0.04518867 1.0656723 -0.4400166
## age dis rad tax ptratio
## low -0.8769058 0.8373884 -0.7108330 -0.7500256 -0.36653744
## med_low -0.3740537 0.3495078 -0.5476413 -0.4609922 -0.05908442
## med_high 0.3282864 -0.3313094 -0.4197268 -0.3452952 -0.26341099
## high 0.8126474 -0.8562799 1.6386213 1.5144083 0.78135074
## black lstat medv
## low 0.3741598 -0.74741875 0.48560869
## med_low 0.3152628 -0.14011741 -0.01330094
## med_high 0.1324554 -0.06854503 0.28176306
## high -0.7411392 0.92438237 -0.73257995
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.08730375 0.67588503 -0.92706372
## indus 0.07552247 -0.24089491 0.45232155
## chas -0.09771443 -0.09565482 -0.02112186
## nox 0.34916039 -0.87431947 -1.40591647
## rm -0.14469267 -0.10552388 -0.11131743
## age 0.15527438 -0.28021976 -0.07168076
## dis -0.06829618 -0.29160233 0.11451008
## rad 3.44793448 0.82182130 -0.22959177
## tax -0.02742408 0.11506358 0.63725904
## ptratio 0.07427527 -0.01121120 -0.38048469
## black -0.10736717 0.03939212 0.12549197
## lstat 0.22558539 -0.25481146 0.36480263
## medv 0.19499455 -0.52799945 -0.22360248
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9591 0.0302 0.0107
# Draw biplot arrows on an existing LDA plot: one arrow per predictor,
# pointing along its coefficients on the chosen pair of discriminants,
# with the predictor name written at the arrow head.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  # coefficient matrix of the lda fit (rows = predictors, cols = LDs)
  coefs <- coef(x)
  arrows(x0 = 0, y0 = 0,
         x1 = myscale * coefs[, choices[1]],
         y1 = myscale * coefs[, choices[2]],
         col = color, length = arrow_heads)
  # label each arrow; the two selected coefficient columns give the
  # (x, y) text positions
  text(myscale * coefs[, choices], labels = row.names(coefs),
       cex = tex, col = color, pos = 3)
}
# numeric codes of the crime classes, used as plot colours/symbols
classes <- as.numeric(train$crime)
# plot the lda results (dimen = 2 draws the first two discriminants)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 2)
# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results (rows = true class, columns = predicted)
tab <- table(correct = correct_classes, predicted = lda.pred$class)
tab
## predicted
## correct low med_low med_high high
## low 19 7 1 0
## med_low 1 15 5 0
## med_high 0 7 22 2
## high 0 0 0 23
# Row-wise prediction accuracy: each true class's share of predictions
# falling in every predicted class (rows sum to 1).
# prop.table(tab, margin = 1) replaces the manual row-by-row rbind
# construction, which also had to repair the rownames that rbind()
# dropped by copying the column names (fragile if the table were not
# square). prop.table keeps the original dimnames intact.
Predict_accuracy <- prop.table(tab, margin = 1)
Predict_accuracy
## low med_low med_high high
## low 0.70370370 0.2592593 0.03703704 0.00000000
## med_low 0.04761905 0.7142857 0.23809524 0.00000000
## med_high 0.00000000 0.2258065 0.70967742 0.06451613
## high 0.00000000 0.0000000 0.00000000 1.00000000
# Cross-check the confusion matrix with the caret package.
# Fixed: use library() instead of require() for loading — require()
# merely returns FALSE when the package is missing, which would let the
# calls below fail with a confusing error instead of failing loudly here.
library(caret)
## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following objects are masked from 'package:InformationValue':
##
## confusionMatrix, precision, sensitivity, specificity
# Fixed: caret::confusionMatrix(data, reference) expects the
# *predictions* first and the true classes second; the original call had
# them swapped, which transposes the table and swaps the per-class
# sensitivity/specificity. (The output shown below was produced by the
# original, swapped call — overall accuracy is unaffected.)
confusionMatrix(data = lda.pred$class, reference = correct_classes)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low med_low med_high high
## low 19 7 1 0
## med_low 1 15 5 0
## med_high 0 7 22 2
## high 0 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.7745
## 95% CI : (0.6811, 0.8514)
## No Information Rate : 0.2843
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6997
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: low Class: med_low Class: med_high Class: high
## Sensitivity 0.9500 0.5172 0.7857 0.9200
## Specificity 0.9024 0.9178 0.8784 1.0000
## Pos Pred Value 0.7037 0.7143 0.7097 1.0000
## Neg Pred Value 0.9867 0.8272 0.9155 0.9747
## Prevalence 0.1961 0.2843 0.2745 0.2451
## Detection Rate 0.1863 0.1471 0.2157 0.2255
## Detection Prevalence 0.2647 0.2059 0.3039 0.2255
## Balanced Accuracy 0.9262 0.7175 0.8320 0.9600
# Euclidean distance matrix
# (drop the factor 'crime' first: dist() needs numeric columns only)
boston_scaled <- dplyr::select(boston_scaled, -crime)
dist_eu <- dist(boston_scaled)
# Summary of the distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.3984 4.7296 4.7597 6.0150 12.5315
# Manhattan distance matrix
dist_man <- dist(boston_scaled, method = 'manhattan')
# Summary of the distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2645 8.2073 12.0993 12.8752 16.8027 43.5452
# k-means clustering with 4
# NOTE(review): kmeans() starts from random centers and no seed has been
# set yet (set.seed() only appears further below), so these cluster
# assignments are not reproducible run-to-run.
km4 <-kmeans(boston_scaled, centers = 4)
# plot the Boston dataset with clusters
# For this, we choose 5 variables - dis, medv, black, lstat and tax
pairs(boston_scaled[c("dis", "medv", "black", "lstat", "tax")], col = km4$cluster)
# k-means clustering with 3
km3 <-kmeans(boston_scaled, centers = 3)
# plot the Boston dataset with clusters
# For this, we choose 5 variables - dis, medv, black, lstat and tax
pairs(boston_scaled[c("dis", "medv", "black", "lstat", "tax")], col = km3$cluster)
set.seed(100)
# Compute and plot the total within-cluster sum of squares for k = 1 to
# k = 15, to find the "elbow". (An earlier comment said k = 2 to 15; the
# loop below actually starts at 1, as the 15 printed values confirm.)
k.max <- 15
data <- boston_scaled
# nstart = 50 restarts each k-means from 50 random initializations and
# keeps the best, making the curve much more stable
clust_TSS <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
clust_TSS
## [1] 6565.000 4207.600 3519.743 3098.744 2746.623 2399.515 2143.440
## [8] 1955.816 1832.813 1736.480 1637.171 1561.280 1498.560 1464.093
## [15] 1385.043
plot(1:k.max, clust_TSS,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
# k-means clustering with 4 centers on the full scaled data
km_bonus <-kmeans(boston_scaled, centers = 4)
# Linear discriminant analysis with clusters from k-means as target
mat <- as.matrix(km_bonus$cluster)
# keep only the cluster labels of the observations present in 'train'
cluster_target <- mat[ rownames(mat) %in% rownames(train), ]
# NOTE(review): 'train' still contains the 'crime' factor, so it is used
# here as a predictor of the cluster target — confirm that is intended.
lda.fit.bonus <- lda(cluster_target ~., data = train)
# target classes as numeric
classes <- as.numeric(cluster_target)
# Plot the lda results
plot(lda.fit.bonus, dimen = 2, col = classes, pch = classes)
# Fixed: the arrows must come from the fit that was just plotted
# (lda.fit.bonus); the original overlaid arrows from the earlier
# 'lda.fit', whose coefficients belong to a different model.
lda.arrows(lda.fit.bonus, myscale = 2)
The code below takes the (scaled) train data that was used to fit the LDA and computes a matrix product — a projection of the data points onto the linear discriminants.
# predictors only: drop the target 'crime' from the training data
model_predictors <- dplyr::select(train, -crime)
# check the dimensions: the predictors (404 x 13) must conform with the
# LDA scaling matrix (13 x 3) for the matrix product below
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication: project each training observation onto LD1-LD3
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
# 3D scatter of the projection, coloured by the crime class in train
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = train$crime)
# Same projection coloured by the k-means clusters.
# NOTE(review): 'cluster_target' is numeric, so plotly uses a continuous
# colour scale here; wrap it in as.factor() for discrete cluster colours.
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = cluster_target)